Dataset downloaded from mgandal’s github repository.
# Load csvs
datExpr = read.csv('./../../../initialExperiments/FirstYearReview/Data/Gandal/RNAseq_ASD_datExpr.csv', row.names=1)
datMeta = read.csv('./../../../initialExperiments/FirstYearReview/Data/Gandal/RNAseq_ASD_datMeta.csv')
# Group brain regions by lobes
datMeta$Brain_Region = as.factor(datMeta$Region)
datMeta$Brain_lobe = 'Occipital'
datMeta$Brain_lobe[datMeta$Brain_Region %in% c('BA4_6', 'BA9', 'BA24', 'BA44_45')] = 'Frontal'
datMeta$Brain_lobe[datMeta$Brain_Region %in% c('BA3_1_2_5', 'BA7')] = 'Parietal'
datMeta$Brain_lobe[datMeta$Brain_Region %in% c('BA38', 'BA39_40', 'BA20_37', 'BA41_42_22')] = 'Temporal'
datMeta$Brain_lobe=factor(datMeta$Brain_lobe, levels=c('Frontal', 'Temporal', 'Parietal', 'Occipital'))
# Remove '/' from Batch variable: (It is recommended (but not required) to use only letters, numbers,
# and delimiters '_' or '.', in levels of factors as these are safe characters for column names in R
datMeta$Batch = gsub('/', '.', datMeta$RNAExtractionBatch) %>% as.factor
# Transform Diagnosis into a factor variable
datMeta$Diagnosis_ = factor(datMeta$Diagnosis_, levels=c('CTL','ASD'))
# GO Neuronal annotations
GO_annotations = read.csv('./../../../initialExperiments/FirstYearReview/Data/GO_annotations/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>%
mutate('ID'=as.character(ensembl_gene_id)) %>%
dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
mutate('Neuronal'=1)
# SFARI Genes
SFARI_genes = read_csv('./../Data/SFARI/SFARI_genes_08-29-2019_with_ensembl_IDs.csv')
SFARI_genes = SFARI_genes[!duplicated(SFARI_genes$ID) & !is.na(SFARI_genes$ID),]
rm(GO_annotations)
Data description taken from the dataset’s synapse entry: RNAseq data was generated from 88 postmortem cortex brain samples from subjects with ASD (53 samples from 24 subjects) and non-psychiatric controls (35 samples from 17 subjects), across four cortical regions encompassing all major cortical lobes – frontal, temporal, parietal, and occipital. Brain samples were obtained from the Harvard Brain Bank as part of the Autism Tissue Project (ATP).
print(paste0('Dataset includes ', nrow(datExpr), ' genes from ', ncol(datExpr), ' samples belonging to ', length(unique(datMeta$Subject_ID)), ' different subjects.'))
## [1] "Dataset includes 63682 genes from 88 samples belonging to 41 different subjects."
getinfo = c('ensembl_gene_id','external_gene_id','chromosome_name','start_position',
'end_position','strand','band','gene_biotype','percentage_gc_content')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL',
dataset='hsapiens_gene_ensembl',
host='feb2014.archive.ensembl.org') ## Gencode v19
datGenes = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=rownames(datExpr), mart=mart)
datGenes = datGenes[match(rownames(datExpr), datGenes$ensembl_gene_id),]
datGenes$length = datGenes$end_position-datGenes$start_position
rm(getinfo, mart)
# 1. Filter genes with start or end position missing
to_keep = !is.na(datGenes$length)
print(paste0('Names of the \'Genes\' removed: ', paste(rownames(datExpr)[!to_keep], collapse=', ')))
## [1] "Names of the 'Genes' removed: __no_feature, __ambiguous, __too_low_aQual, __not_aligned, __alignment_not_unique"
datGenes = datGenes[to_keep,]
datExpr = datExpr[to_keep,]
rownames(datGenes) = datGenes$ensembl_gene_id
# 2. Filter genes that do not encode any protein
to_keep = datGenes$gene_biotype=='protein_coding'
datExpr = datExpr %>% filter(to_keep)
datGenes = datGenes %>% filter(to_keep)
rownames(datExpr) = datGenes$ensembl_gene_id
rownames(datGenes) = datGenes$ensembl_gene_id
# 3. Filter genes with low expression levels
# 3.1 Remove genes with zero expression in all of the samples
to_keep = rowSums(datExpr)>0
datGenes = datGenes[to_keep,]
datExpr = datExpr[to_keep,]
print(paste0('Filtered dataset includes ', nrow(datExpr), ' genes from ', ncol(datExpr),
' samples belonging to ', length(unique(datMeta$Subject_ID)), ' different subjects.'))
## [1] "Filtered dataset includes 19426 genes from 88 samples belonging to 41 different subjects."
print(paste0(length(unique(SFARI_genes$`gene-symbol`[SFARI_genes$ID %in% rownames(datExpr)])), ' SFARI genes remaining'))
## [1] "967 SFARI genes remaining"
# Save datasets before level of expression filtering
datExpr_original = datExpr
datGenes_original = datGenes
datMeta_original = datMeta
Filtering outlier samples (there aren’t many)
Creating DESeq object and normalising using vst transformation
#thresholds = c(0.5, 1, seq(2,10), 12, 15, 17, 20, 30, 40, 50, 60, 70, 80, 100)
thresholds = c(0, 0.1, seq(0.2, 2, 0.2), 2.5, 3, 5, 7.5, 10)
for(threshold in thresholds){
datMeta = datMeta_original
datExpr = datExpr_original
datGenes = datGenes_original
cat(paste0('\n\nFiltering with threshold: ', threshold,'\n'))
to_keep = rowMeans(datExpr)>threshold
datGenes = datGenes[to_keep,]
datExpr = datExpr[to_keep,]
# Filter outlier samples
absadj = datExpr %>% bicor %>% abs
netsummary = fundamentalNetworkConcepts(absadj)
ku = netsummary$Connectivity
z.ku = (ku-mean(ku))/sqrt(var(ku))
to_keep = abs(z.ku)<2
datMeta = datMeta[to_keep,]
datExpr = datExpr[,to_keep]
cat(paste0('Removing ', sum(!to_keep), ' samples\n'))
rm(absadj, netsummary, ku, z.ku, to_keep)
# Create a DeseqDataSet object, estimate the library size correction and save the normalized counts matrix
counts = datExpr %>% as.matrix
rowRanges = GRanges(datGenes$chromosome_name,
IRanges(datGenes$start_position, width=datGenes$length),
strand=datGenes$strand,
feature_id=datGenes$ensembl_gene_id)
se = SummarizedExperiment(assays=SimpleList(counts=counts), rowRanges=rowRanges, colData=datMeta)
dds = DESeqDataSet(se, design =~Diagnosis_)
# Perform vst
vsd = vst(dds)
datExpr_vst = assay(vsd)
datMeta_vst = colData(vsd)
datGenes_vst = rowRanges(vsd)
rm(counts, rowRanges, se, vsd)
# Save summary results in dataframe
if(threshold == thresholds[1]){
mean_vs_sd_data = data.frame('threshold'=threshold, 'ID'=rownames(datExpr_vst),
'Mean'=rowMeans(datExpr_vst), 'SD'=apply(datExpr_vst,1,sd))
} else {
new_entries = data.frame('threshold'=threshold, 'ID'=rownames(datExpr_vst),
'Mean'=rowMeans(datExpr_vst), 'SD'=apply(datExpr_vst,1,sd))
mean_vs_sd_data = rbind(mean_vs_sd_data, new_entries)
}
}
##
##
## Filtering with threshold: 0
## alpha: 1.000000
## Removing 7 samples
##
##
## Filtering with threshold: 0.1
## alpha: 1.000000
## Removing 6 samples
##
##
## Filtering with threshold: 0.2
## alpha: 1.000000
## Removing 6 samples
##
##
## Filtering with threshold: 0.4
## alpha: 1.000000
## Removing 6 samples
##
##
## Filtering with threshold: 0.6
## alpha: 1.000000
## Removing 6 samples
##
##
## Filtering with threshold: 0.8
## alpha: 1.000000
## Removing 6 samples
##
##
## Filtering with threshold: 1
## alpha: 1.000000
## Removing 6 samples
##
##
## Filtering with threshold: 1.2
## alpha: 1.000000
## Removing 6 samples
##
##
## Filtering with threshold: 1.4
## alpha: 1.000000
## Removing 6 samples
##
##
## Filtering with threshold: 1.6
## alpha: 1.000000
## Removing 6 samples
##
##
## Filtering with threshold: 1.8
## alpha: 1.000000
## Removing 6 samples
##
##
## Filtering with threshold: 2
## alpha: 1.000000
## Removing 6 samples
##
##
## Filtering with threshold: 2.5
## alpha: 1.000000
## Removing 6 samples
##
##
## Filtering with threshold: 3
## alpha: 1.000000
## Removing 6 samples
##
##
## Filtering with threshold: 5
## alpha: 1.000000
## Removing 6 samples
##
##
## Filtering with threshold: 7.5
## alpha: 1.000000
## Removing 6 samples
##
##
## Filtering with threshold: 10
## alpha: 1.000000
## Removing 6 samples
rm(new_entries)
to_keep_1 = mean_vs_sd_data$ID[mean_vs_sd_data$threshold==thresholds[1] & mean_vs_sd_data$Mean<6] %>%
as.character
to_keep_2 = mean_vs_sd_data$ID[mean_vs_sd_data$threshold==thresholds[1] & mean_vs_sd_data$Mean>=6]
to_keep_2 = to_keep_2 %>% sample(round(length(to_keep_2)/20)) %>% as.character
plot_data = mean_vs_sd_data[mean_vs_sd_data$ID %in% c(to_keep_1, to_keep_2),]
Note: Plotting all of the genes is too heavy, so I’m going to filter most of the genes with the highest levels of expression because we care about the behaviour of the genes with low levels
The weird behaviour from the left tail disappears at around 1.6
ggplotly(plot_data %>% ggplot(aes(Mean, SD)) +
geom_point(color='#0099cc', alpha=0.2, aes(id=ID, frame=threshold)) +
scale_x_log10() + scale_y_log10() + theme_minimal())
rm(to_keep_1, to_keep_2, plot_data)
The trend line doesn’t actually flatten out completely with any threshold, it takes really high thresholds to flatten out the left curve of the trend line, but you lose a lot of genes to achieve this. Maybe the trend line is too sensitive and is not a good representation to select the filtering threshold
ggplotly(mean_vs_sd_data %>% ggplot(aes(Mean, SD)) +
geom_line(stat='smooth', method='loess', se=FALSE, alpha=0.5,
aes(group=threshold, color=threshold)) +
ylim(min(mean_vs_sd_data$SD), max(mean_vs_sd_data$SD)) +
ylab('Remaining Genes') + theme_minimal() +
ggtitle('Trend lines for different filtering thresholds'))
plot_data = mean_vs_sd_data %>% group_by(threshold) %>% tally
ggplotly(plot_data %>% ggplot(aes(threshold, n)) + geom_point() + geom_line() +
theme_minimal() + ggtitle('Remaining genes for each filtering threshold'))
The scatter plot and the trend lines seem to give different results, the first one seems to point to a low threshold as the optimum and the second one at a much higher threshold.
sessionInfo()
## R version 3.6.2 (2019-12-12)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.3 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/openblas/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] knitr_1.23 dendextend_1.12.0
## [3] vsn_3.52.0 WGCNA_1.68
## [5] fastcluster_1.1.25 dynamicTreeCut_1.63-1
## [7] sva_3.32.1 genefilter_1.66.0
## [9] mgcv_1.8-31 nlme_3.1-143
## [11] DESeq2_1.24.0 SummarizedExperiment_1.14.1
## [13] DelayedArray_0.10.0 BiocParallel_1.18.1
## [15] matrixStats_0.54.0 Biobase_2.44.0
## [17] GenomicRanges_1.36.0 GenomeInfoDb_1.20.0
## [19] IRanges_2.18.2 S4Vectors_0.22.0
## [21] BiocGenerics_0.30.0 biomaRt_2.40.4
## [23] ggExtra_0.9 GGally_1.4.0
## [25] gridExtra_2.3 viridis_0.5.1
## [27] viridisLite_0.3.0 RColorBrewer_1.1-2
## [29] plotlyutils_0.0.0.9000 plotly_4.9.0
## [31] glue_1.3.1 reshape2_1.4.3
## [33] forcats_0.4.0 stringr_1.4.0
## [35] dplyr_0.8.2 purrr_0.3.3
## [37] readr_1.3.1 tidyr_0.8.3
## [39] tibble_2.1.3 ggplot2_3.2.0
## [41] tidyverse_1.2.1
##
## loaded via a namespace (and not attached):
## [1] readxl_1.3.1 backports_1.1.4 Hmisc_4.2-0
## [4] plyr_1.8.4 lazyeval_0.2.2 splines_3.6.2
## [7] crosstalk_1.0.0 robust_0.4-18.1 digest_0.6.19
## [10] foreach_1.4.7 htmltools_0.3.6 GO.db_3.8.2
## [13] magrittr_1.5 checkmate_1.9.4 memoise_1.1.0
## [16] fit.models_0.5-14 cluster_2.1.0 doParallel_1.0.15
## [19] limma_3.40.6 annotate_1.62.0 modelr_0.1.5
## [22] prettyunits_1.0.2 colorspace_1.4-1 blob_1.2.0
## [25] rvest_0.3.4 rrcov_1.4-7 haven_2.1.1
## [28] xfun_0.8 crayon_1.3.4 RCurl_1.95-4.12
## [31] jsonlite_1.6 zeallot_0.1.0 impute_1.58.0
## [34] survival_3.1-8 iterators_1.0.12 gtable_0.3.0
## [37] zlibbioc_1.30.0 XVector_0.24.0 DEoptimR_1.0-8
## [40] scales_1.0.0 mvtnorm_1.0-11 DBI_1.0.0
## [43] miniUI_0.1.1.1 Rcpp_1.0.1 xtable_1.8-4
## [46] progress_1.2.2 htmlTable_1.13.1 foreign_0.8-74
## [49] bit_1.1-14 preprocessCore_1.46.0 Formula_1.2-3
## [52] htmlwidgets_1.3 httr_1.4.0 acepack_1.4.1
## [55] pkgconfig_2.0.2 reshape_0.8.8 XML_3.98-1.20
## [58] nnet_7.3-12 locfit_1.5-9.1 labeling_0.3
## [61] tidyselect_0.2.5 rlang_0.4.2 later_0.8.0
## [64] AnnotationDbi_1.46.1 munsell_0.5.0 cellranger_1.1.0
## [67] tools_3.6.2 cli_1.1.0 generics_0.0.2
## [70] RSQLite_2.1.2 broom_0.5.2 evaluate_0.14
## [73] yaml_2.2.0 bit64_0.9-7 robustbase_0.93-5
## [76] mime_0.7 xml2_1.2.2 compiler_3.6.2
## [79] rstudioapi_0.10 curl_3.3 affyio_1.54.0
## [82] geneplotter_1.62.0 pcaPP_1.9-73 stringi_1.4.3
## [85] lattice_0.20-38 Matrix_1.2-18 vctrs_0.2.0
## [88] pillar_1.4.2 BiocManager_1.30.4 data.table_1.12.2
## [91] bitops_1.0-6 httpuv_1.5.1 affy_1.62.0
## [94] R6_2.4.0 latticeExtra_0.6-28 promises_1.0.1
## [97] codetools_0.2-16 MASS_7.3-51.5 assertthat_0.2.1
## [100] withr_2.1.2 GenomeInfoDbData_1.2.1 hms_0.5.1
## [103] grid_3.6.2 rpart_4.1-15 rmarkdown_1.13
## [106] shiny_1.3.2 lubridate_1.7.4 base64enc_0.1-3